Problem Set 2

# this code clears the environment and installs/loads popular packages

rm(list = ls()) 
  gc()            
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 521553 27.9    1155494 61.8         NA   669314 35.8
## Vcells 961026  7.4    8388608 64.0      16384  1840016 14.1
  cat("\f")  
packages <- c("readr", #open csv
              "psych", # quick summary stats for data exploration,
              "stargazer", #summary stats for sharing,
              "tidyverse", # data manipulation like selecting variables,
              "corrplot", # correlation plots
              "ggplot2", # graphing
              "ggcorrplot", # correlation plot
              "gridExtra", #overlay plots
              "data.table", # reshape for graphing 
              "car", #vif
              "prettydoc", # html output
              "visdat", # visualize missing variables
              "glmnet", # lasso/ridge
              "caret", # confusion matrix
              "MASS", #step AIC
              "plm", # fixed effects demeaned regression
              "lmtest", # test regression coefficients
              "fpp3", # Foprecasting: Principles & Practice supplement
              "tsibble", 
              "tsibbledata",
              "lubridate",
              "forecast",
              "seasonal"
)

for (i in 1:length(packages)) {
  if (!packages[i] %in% rownames(installed.packages())) {
    install.packages(packages[i]
                     , repos = "http://cran.rstudio.com/"
                     , dependencies = TRUE
    )
  }
  library(packages[i], character.only = TRUE)
}

rm(packages)

Chapter 2

Exercise 2: Use filter() to find what days corresponded to the peak closing price for each of the four stocks in gafa_stock

data(gafa_stock)
gafa_stock %>%
  group_by(Symbol) %>%
  filter(Close == max(Close)) 
## # A tsibble: 4 x 8 [!]
## # Key:       Symbol [4]
## # Groups:    Symbol [4]
##   Symbol Date        Open  High   Low Close Adj_Close   Volume
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>
## 1 AAPL   2018-10-03  230.  233.  230.  232.      230. 28654800
## 2 AMZN   2018-09-04 2026. 2050. 2013  2040.     2040.  5721100
## 3 FB     2018-07-25  216.  219.  214.  218.      218. 58954200
## 4 GOOG   2018-07-26 1251  1270. 1249. 1268.     1268.  2405600

Exercise 3: Download the file tute1.csv from the book website, open it in Excel (or some other spreadsheet application), and review its contents. You should find four columns of information. Columns B through D each contain a quarterly series, labelled Sales, AdBudget and GDP. Sales contains the quarterly sales for a small company over the period 1981-2005. AdBudget is the advertising budget and GDP is the gross domestic product. All series have been adjusted for inflation.

tute1 <- readr::read_csv("tute1.csv")
head(tute1, 4)
## # A tibble: 4 × 4
##   Quarter    Sales AdBudget   GDP
##   <date>     <dbl>    <dbl> <dbl>
## 1 1981-03-01 1020.     659.  252.
## 2 1981-06-01  889.     589   291.
## 3 1981-09-01  795      512.  291.
## 4 1981-12-01 1004.     614.  292.
mytimeseries <- tute1 %>%
  mutate(Quarter = yearquarter(Quarter)) %>%
  as_tsibble(index = Quarter)
mytimeseries %>%
  pivot_longer(-Quarter) %>%
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line() +
  facet_grid(name ~ ., scales = "free_y")

The exercise prompts us to: Check what happens when you don’t include facet_grid().

mytimeseries %>%
  pivot_longer(-Quarter) %>%
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line()

The facet_grid()

function allows for the variables to be displayed using their own corresponsing value scales (y axis) while keeping the X axis the same. It shows more detail in the variance of the variables.

Exercise 10: The aus_livestock data contains the monthly total number of pigs slaughtered in Victoria, Australia, from Jul 1972 to Dec 2018. Use filter() to extract pig slaughters in Victoria between 1990 and 1995. Use autoplot() and ACF() for this data. How do they differ from white noise? If a longer period of data is used, what difference does it make to the ACF?

data(aus_livestock)

aus_livestock %>%
  mutate(Month = yearmonth(Month)) %>%
  as_tsibble(index = Month)
## # A tsibble: 29,364 x 4 [1M]
## # Key:       Animal, State [54]
##       Month Animal                     State                        Count
##       <mth> <fct>                      <fct>                        <dbl>
##  1 1976 Jul Bulls, bullocks and steers Australian Capital Territory  2300
##  2 1976 Aug Bulls, bullocks and steers Australian Capital Territory  2100
##  3 1976 Sep Bulls, bullocks and steers Australian Capital Territory  2100
##  4 1976 Oct Bulls, bullocks and steers Australian Capital Territory  1900
##  5 1976 Nov Bulls, bullocks and steers Australian Capital Territory  2100
##  6 1976 Dec Bulls, bullocks and steers Australian Capital Territory  1800
##  7 1977 Jan Bulls, bullocks and steers Australian Capital Territory  1800
##  8 1977 Feb Bulls, bullocks and steers Australian Capital Territory  1900
##  9 1977 Mar Bulls, bullocks and steers Australian Capital Territory  2700
## 10 1977 Apr Bulls, bullocks and steers Australian Capital Territory  2300
## # … with 29,354 more rows
victoria <- aus_livestock %>%
  filter(State == "Victoria",
         Animal == "Pigs",
         year(Month) %in% 1990:1995)
ACF(victoria, Count) %>% 
  autoplot()

There looks to be serious autocorrelation from lags as shown in the ACF. If longer time periods are used:

victoria2 <- aus_livestock %>%
  filter(State == "Victoria",
         Animal == "Pigs",
         year(Month) %in% 1980:2018)
ACF(victoria2, Count) %>% 
  autoplot()

the autocorrelation remains and even appears stronger. This means this is not a white noise dataset and there is some clear seasonal trends at almost all lags.

Chapter 3

Exercise 3: Why is a Box-Cox transformation unhelpful for the canadian_gas data?

autoplot(canadian_gas, Volume)

Since the changes in variation over time are not consistent (not increasing over time, for example) the Box-Cox will be ineffective in standardizing the variation. Let’s plot a Box-Cox transform to show:

lambda <- canadian_gas %>%
  features(Volume, features = guerrero)  %>%
  pull(lambda_guerrero)

canadian_gas %>%
  autoplot(box_cox(Volume,lambda)) 

We see here that there is still inconsistent variance in the data.

Exercise 10: This exercise uses the canadian_gas data (monthly Canadian gas production in billions of cubic metres, January 1960 – February 2005).

(a) Plot the data using autoplot(), gg_subseries() and gg_season() to look at the effect of the changing seasonality over time.
canadian_gas %>%
  autoplot(Volume)+
  labs(title = "Monthly Canadian Gas Production")

canadian_gas %>%
  gg_subseries(Volume)+
  labs(title = "Monthly Canadian Gas Production")

canadian_gas %>%
  gg_season(Volume)+
  labs(title = "Monthly Canadian Gas Production")

(b) Do an STL decomposition of the data. You will need to choose a seasonal window to allow for the changing shape of the seasonal component.
canadian_gas %>%
  model(
    STL(Volume ~ trend(window = 21) + # window = 21 for monthly data
          season(window = "periodic"),
        robust = TRUE)) %>%
  components() %>%
  autoplot()

(c) How does the seasonal shape change over time?

Looking at the seasonal chart above, we see that over time demand still dips in the summer months but is less severe. Demand changes due to the weather and generally follows the same trend.

canadian_gas %>% 
  model(`STL` = STL(Volume ~ trend(window = 7) + season(window = 21))) %>% 
  components() %>% 
  gg_season(season_year) 

(d) Can you produce a plausible seasonally adjusted series?
canadian_gas %>% 
  model(`STL` = STL(Volume ~ trend(window = 7) + season(window = 21))) %>% 
  components() %>% 
  pull(season_adjust) -> canadian_gas_adjusted
canadian_gas %>%
 model(
    STL(Volume ~ trend(window = 21) +
                   season(window = 13),
    robust = TRUE)) %>%
  components() %>%
  ggplot(aes(x = Month)) +
  geom_line(aes(y = Volume, colour = "Data")) +
  geom_line(aes(y = season_adjust,
                colour = "Seasonally Adjusted")) +
  geom_line(aes(y = trend, colour = "Trend")) +
  labs(title = "STL decomposition of Canadian Gas Production") +
  scale_colour_manual(
    values = c("gray", "#0072B2", "#D55E00"),
    breaks = c("Data", "Seasonally Adjusted", "Trend")
  )

(e) Compare the results with those obtained using SEATS and X-11. How are they different?

X-11

x11_dcmp <- canadian_gas %>%
  model(x11 = X_13ARIMA_SEATS(Volume ~ x11())) %>%
  components()
autoplot(x11_dcmp) +
  labs(title =
    "Decomposition of total Canadian Gas using X-11.")

SEATS

seats_dcmp <- canadian_gas %>%
  model(seats = X_13ARIMA_SEATS(Volume ~ seats())) %>%
  components()
autoplot(seats_dcmp) +
  labs(title =
    "Decomposition of Canadian GAs using SEATS")

The irregular series is lower with SEATS. Other trends look similar.